import requests
import json
import datetime
import sys
import os
from tqdm import tqdm
from bs4 import BeautifulSoup

def excuteSpider(url, headers, session):
    """Scrape one ranking page and return its parsed columns.

    Args:
        url: URL of a single top.chinaz.com ranking page.
        headers: HTTP header dict (Cookie / User-Agent) sent with the request.
        session: requests.Session reused across pages for connection pooling.

    Returns:
        A 4-tuple of equal-purpose lists:
        (ranks, domains, site names, site descriptions), one entry per row
        found on the page.
    """
    # timeout prevents the whole crawl from hanging forever on one slow page
    req = session.get(url, headers=headers, timeout=30)
    req.encoding = "utf-8"
    bsObj = BeautifulSoup(req.text, 'html.parser')

    # find_all is the current name; findAll is a deprecated bs3-era alias.
    rankList = bsObj.find_all("strong", {"class": "col-red02"})
    linkList = bsObj.find_all("span", {"class": "col-gray"})
    nameList = bsObj.find_all("a", {"class": "pr10 fz14"})
    infoList = bsObj.find_all("p", {"class": "RtCInfo"})

    lranksub = [rank.get_text() for rank in rankList]
    # entries containing '(' are filtered out, as in the original scraper —
    # presumably annotated/alternate domain rows; confirm against live markup
    llinksub = [link.get_text() for link in linkList
                if '(' not in link.get_text()]
    lnamesub = [name.get_text() for name in nameList]

    lnfosub = []
    for info in infoList:
        # The description follows a full-width colon ('：').  maxsplit=1 keeps
        # any further colons inside the description text, and the guard avoids
        # an IndexError on rows where the colon is missing.
        parts = info.get_text().split('：', 1)
        lnfosub.append(parts[1] if len(parts) > 1 else '')

    return lranksub, llinksub, lnamesub, lnfosub
    
if __name__ == '__main__':

    # Accumulators for all pages: rank, domain, site name, description.
    lrank = []
    llink = []
    lname = []
    lnfo = []
    session = requests.Session()

    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded",
        "Cookie": "Hm_lvt_e2d6533b8d3c86a8202250d4989a2fe5=1594263790; UM_distinctid=1733186e35dd07-066df823586632-7a1437-e1000-1733186e35ecbe; PHPSESSID=qohkl9cgabsv14f7jatrl1sl42; exi_query_history=IxGjvC2Bm3NLwvy4Ox7IfCz7gKGkEcjHRWkIemRtAGmDhK5e4qLILAxGJWCr-D8Kgah4qhzBsotZIJTYc5jTpq63nCfBRMk176e4Fdm3D6ymNGYy18XvnqDXYq0BVIka-GrurNBIsAbnF0IN67JA2rID3YWrxyjV-GNxQvBLh4VneY-L; exi_users=PTXl5X0WGo6-F9-FwneiCpyphXanNLejfpxEvRZEfDRVotQMletHig6hXEmBnKANQPXVDi3UM48PgVHwAhus-FKHQKfiE2rEjBpVuUFyD8w5nf-FjaIvI-ENMXXpBlyqINgTY6ljhwmRl5nxbslFoPdFz58NfJjrpLjoJhV-Exb1Ltnfrrk2J1WyYTdCeEbdperVl4pcB2uiQCSI8BSjeJ-EitboSzQ0-FqdUmc61z9aoYNkLlq7DIBsi33V5EIQDWHbLWqnqhS8FDF3es6r5Fl7SgUUmWQ-EXhF5AKPH6vMIXzmQQOZCYVM548Qz4W9qqGA5aptXzE0-E2R-Fv5RlXH8WXVUgjQdeRMkTo8FqwKTtj9EuRF2ZFFHnt-ExHu5bmIMPiYXcF-FX3V5vHIstJtub2-E-FIhuKTyKRHwgxjqmEVu1jtsv1HH4-O; CNZZDATA1276361993=1264455532-1594260046-%7C1594346791; Hm_lpvt_e2d6533b8d3c86a8202250d4989a2fe5=1594347558",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2"
    }

    # Fail with a usage message instead of a raw IndexError when the
    # page count is missing from the command line.
    if len(sys.argv) < 2:
        sys.exit("usage: python spider.py <pages>")
    pages = int(sys.argv[1])

    for i in tqdm(range(1, pages + 1), ncols=50):
        # Page 1 has no numeric suffix on this site; later pages do.
        if i == 1:
            url = "https://top.chinaz.com/all/index_br.html"
        else:
            url = "https://top.chinaz.com/all/index_br_" + str(i) + ".html"

        lranksub, llinksub, lnamesub, lnfosub = excuteSpider(url, headers, session)
        lrank += lranksub
        llink += llinksub
        lname += lnamesub
        lnfo += lnfosub

    # Build the output document ONCE, after the crawl.  The original
    # re-created this dict on every loop iteration and raised NameError
    # at the append step below whenever pages < 1.
    jsontext = {"version": 0, "domains": []}
    for rank, link, sitename, info in zip(lrank, llink, lname, lnfo):
        jsontext["domains"].append({
            "id": rank,
            "domain": link,
            "name": sitename,
            "description": info,
        })

    # Write a timestamped JSON file next to the script.
    work_dir = sys.path[0]
    now = datetime.datetime.now()
    out_name = datetime.datetime.strftime(now, '%Y%m%d%H%M%S.json')
    chinazfilepath = os.path.join(work_dir, out_name)
    # Context manager guarantees the handle is closed even on error;
    # explicit UTF-8 keeps the non-ASCII descriptions portable across
    # platforms (the original relied on the locale default encoding).
    with open(chinazfilepath, 'w', encoding='utf-8') as wf:
        wf.write(json.dumps(jsontext, indent=4, ensure_ascii=False))
    print("ok")